Packages We Need
library(tidyverse)
library(skimr)
library(naniar)
library(ggplot2)
library(dplyr)
library(maps)
library(flextable)
library(plotly)
library(flexdashboard)
library(lubridate)
library(stringr)
library(kableExtra)
library(highcharter)
library(wordcloud2)
library(RColorBrewer)
library(usmap)
library(leaflet)
library(broom)
library(moments)
set.seed(1994)Loading the Datasets
# Fast Food dataset
fastFoodData <- read_csv("fastFoodGeo.csv")
#Census dataset
censusData <- read_csv("census_data_full_2008-2021.csv")Merging Data Sets
## Joining two datasets
finalData <- inner_join(fastFoodData, censusData, by = "geoid")
finalData <- finalData %>% select(geoid,keys,address,city,country,name,postalCode,province,lat,long,year,population,median_income,median_monthly_rent_cost,median_monthly_home_cost, prop_female,prop_male, prop_poverty)1. Data dictionary
# Creating data dictionary
dataDictionary <- tibble(Variable = colnames(finalData),
Description = c("Geographic Region ID",
"Keys for identification purposes",
"Address of the location",
"City where the location is situated",
"Country of the location",
"Name of the location",
"Postal code of the location",
"Province or state where the location is located",
"Latitude of the location",
"Longitude of the location",
"Year of data collection",
"Population count",
"Median income of the location",
"Median monthly rental cost of housing",
"Median monthly cost of home ownership",
"Proportion of population that is female",
"Proportion of population that is male",
"Proportion of people 25 and older living in poverty"),
Type = map_chr(finalData, .f = function(x){typeof(x)[1]}),
Class = map_chr(finalData, .f = function(x){class(x)[1]}))
#Setting theme for flextable
set_flextable_defaults(
font.size = 10, theme_fun = theme_vanilla,
padding = 6,
background.color = "#EFEFEF")
flextable::flextable(dataDictionary, cwidth = 2)Variable | Description | Type | Class |
|---|---|---|---|
geoid | Geographic Region ID | character | character |
keys | Keys for identification purposes | character | character |
address | Address of the location | character | character |
city | City where the location is situated | character | character |
country | Country of the location | character | character |
name | Name of the location | character | character |
postalCode | Postal code of the location | character | character |
province | Province or state where the location is located | character | character |
lat | Latitude of the location | double | numeric |
long | Longitude of the location | double | numeric |
year | Year of data collection | double | numeric |
population | Population count | double | numeric |
median_income | Median income of the location | double | numeric |
median_monthly_rent_cost | Median monthly rental cost of housing | double | numeric |
median_monthly_home_cost | Median monthly cost of home ownership | double | numeric |
prop_female | Proportion of population that is female | double | numeric |
prop_male | Proportion of population that is male | double | numeric |
prop_poverty | Proportion of people 25 and older living in poverty | double | numeric |
2. Data Cleaning
a. Merging Datasets
Already did above
b. Date/Time Manipulation
lubridate is primarily designed for handling dates and times, it can also be used for simple year manipulation by converting the year into a complete date object with a fixed month and day.
finalData <- finalData %>% mutate(Date = make_datetime(year = year, month = 01, day = 01))
head(finalData$Date)## [1] "2008-01-01 UTC" "2009-01-01 UTC" "2010-01-01 UTC" "2011-01-01 UTC"
## [5] "2012-01-01 UTC" "2013-01-01 UTC"
c. String Manipulation
#1. Converting Short From States to Long Form States
# Mapping of short form to long form state names
state_mapping <- c("NY" = "New York", "OH" = "Ohio", "KY" = "Kentucky", "SC" = "South Carolina", "AR" = "Arkansas", "OK" = "Oklahoma", "IN" = "Indiana", "NC" = "North Carolina", "TN" = "Tennessee", "TX" = "Texas", "LA" = "Louisiana", "KS" = "Kansas", "ND" = "North Dakota", "UT" = "Utah", "GA" = "Georgia", "NM" = "New Mexico", "OR" = "Oregon", "HI" = "Hawaii", "VT" = "Vermont", "MI" = "Michigan", "MO" = "Missouri", "WI" = "Wisconsin", "WA" = "Washington", "MS" = "Mississippi", "NE" = "Nebraska", "ME" = "Maine", "MN" = "Minnesota", "AL" = "Alabama", "IA" = "Iowa", "WV" = "West Virginia", "AZ" = "Arizona", "SD" = "South Dakota", "WY" = "Wyoming", "IL" = "Illinois", "VA" = "Virginia", "FL" = "Florida", "CA" = "California", "MT" = "Montana", "ID" = "Idaho", "PA" = "Pennsylvania", "RI" = "Rhode Island", "NV" = "Nevada", "NJ" = "New Jersey", "MA" = "Massachusetts", "MD" = "Maryland", "CO" = "Colorado", "NH" = "New Hampshire", "CT" = "Connecticut", "AK" = "Alaska", "DE" = "Delaware", "DC" = "District of Columbia", "Co Spgs" = "Colorado Springs")
# Convert short form to long form in the finalData dataset
finalData$province <- str_replace_all(finalData$province, state_mapping)#2. Update "Colorado Springs" to "Colorado" in the province variable
finalData$province <- str_replace(finalData$province, "Colorado Springs", "Colorado")#3. Handling Duplicates for vendor names
# Replace "&" with "AND"
finalData$name <- str_replace(finalData$name, "&", "AND")
# Remove all non-alphanumeric characters
finalData$name <- str_replace_all(finalData$name, "\\W", "")
# Remove specific characters
finalData$name <- str_replace_all(finalData$name, "Æ", "")
# Convert to uppercase
finalData$name <- str_to_upper(finalData$name)
# Handle duplicates
finalData$name <- str_replace(finalData$name, "MCDONALDS.*", "MCDONALDS")
finalData$name <- str_replace(finalData$name, "LONGJOHNSILVERSA.*", "LONGJOHNSILVERSAANDW")
finalData$name <- str_replace(finalData$name, "AANDWLONGJOHNSILVERS", "LONGJOHNSILVERSAANDW")
finalData$name <- str_replace(finalData$name, "ARBYS.*", "ARBYS")
finalData$name <- str_replace(finalData$name, "ZAXBYS.*", "ZAXBYS")
finalData$name <- str_replace(finalData$name, "WENDYS.*", "WENDYS")
finalData$name <- str_replace(finalData$name, "QUIZNOS.*", "QUIZNOS")
finalData$name <- str_replace(finalData$name, "POPEYES.*", "POPEYES")
finalData$name <- str_replace(finalData$name, "PANDAEXPRESS.*", "PANDAEXPRESS")
finalData$name <- str_replace(finalData$name, "LITTLECAESARS.*", "LITTLECAESARS")
finalData$name <- str_replace(finalData$name, "KFCK.*", "KFC")
finalData$name <- str_replace(finalData$name, "FIVEGUYS.*", "FIVEGUYS")
finalData$name <- str_replace(finalData$name, "DAIRYQUEEN.*", "DAIRYQUEEN")
finalData$name <- str_replace(finalData$name, "CHIPOTLE.*", "CHIPOTLE")
finalData$name <- str_replace(finalData$name, "AUNTIEANNES.*", "AUNTIEANNES")
finalData$name <- str_replace(finalData$name, "CARLSJRTHE.*", "CARLSJRGREENBURRITO")3. Exploratory Data Analysis
skim(finalData)| Name | finalData |
| Number of rows | 229301 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 10 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| geoid | 0 | 1 | 5 | 5 | 0 | 1690 | 0 |
| keys | 0 | 1 | 26 | 109 | 0 | 9946 | 0 |
| address | 0 | 1 | 5 | 60 | 0 | 9880 | 0 |
| city | 0 | 1 | 3 | 23 | 0 | 2759 | 0 |
| country | 0 | 1 | 2 | 2 | 0 | 1 | 0 |
| name | 0 | 1 | 2 | 41 | 0 | 457 | 0 |
| postalCode | 0 | 1 | 4 | 10 | 0 | 5250 | 0 |
| province | 0 | 1 | 4 | 20 | 0 | 51 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| lat | 0 | 1 | 37.49 | 5.05 | 19.65 | 33.96 | 38.34 | 41.14 | 64.84 | ▁▆▇▁▁ |
| long | 0 | 1 | -92.01 | 15.11 | -159.38 | -97.66 | -87.59 | -81.34 | -67.45 | ▁▁▃▆▇ |
| year | 0 | 1 | 2014.60 | 3.82 | 2008.00 | 2011.00 | 2015.00 | 2018.00 | 2021.00 | ▆▇▅▇▇ |
| population | 0 | 1 | 905770.07 | 1677137.71 | 3688.00 | 112060.00 | 320095.00 | 870893.00 | 10170292.00 | ▇▁▁▁▁ |
| median_income | 5 | 1 | 56961.81 | 15354.12 | 18972.00 | 46356.00 | 54053.00 | 64468.00 | 156821.00 | ▃▇▂▁▁ |
| median_monthly_rent_cost | 0 | 1 | 925.95 | 273.35 | 245.00 | 729.00 | 869.00 | 1063.00 | 2599.00 | ▃▇▂▁▁ |
| median_monthly_home_cost | 0 | 1 | 1162.50 | 426.09 | 222.00 | 862.00 | 1088.00 | 1401.00 | 3254.00 | ▃▇▃▁▁ |
| prop_female | 0 | 1 | 0.51 | 0.01 | 0.32 | 0.50 | 0.51 | 0.52 | 0.63 | ▁▁▅▇▁ |
| prop_male | 0 | 1 | 0.49 | 0.01 | 0.37 | 0.48 | 0.49 | 0.50 | 0.68 | ▁▇▅▁▁ |
| prop_poverty | 8 | 1 | 0.15 | 0.05 | 0.02 | 0.11 | 0.14 | 0.17 | 0.47 | ▃▇▂▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Date | 0 | 1 | 2008-01-01 | 2021-01-01 | 2015-01-01 | 14 |
#Out of Total 229301 observations
#5 median_income are missing - Mainly because it was was not recorded or not available for those specific locations.
#8 prop_poverty are missing - Mainly because it was was not recorded or not available for those specific locations.
#My Observation:
#Missing data can impact summary statistics by reducing sample sizes and potentially biasing the results. Visualizations may also be affected if missing values are not handled appropriately, potentially leading to incomplete or inaccurate representations of the data.a. Tables of Summary Statistics
1. Summary statistics of median_income by province
# Calculate the summary statistics of median_income by province
summary_stats_income <- finalData %>%
group_by(province) %>%
summarise(mean_median_income = mean(median_income),
median_median_income = median(median_income),
sd_median_income = sd(median_income),
min_median_income = min(median_income),
max_median_income = max(median_income))
# Keep only the 5 rows with the maximum values of 'mean_median_income' for each province
summary_stats_income_5 <- summary_stats_income %>%
slice_max(n = 5, order_by = mean_median_income)
# Round the summary statistics to two decimal places
summary_stats_income_5 <- summary_stats_income_5 %>%
mutate(across(starts_with("mean_"):starts_with("max_"), ~round(., 2)))
# Set the caption
caption_text <- "Table 1: Summary Statistics of Median Income by Province"
# Print the table with caption using knitr::kable()
knitr::kable(summary_stats_income_5, caption = caption_text, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| province | mean_median_income | median_median_income | sd_median_income | min_median_income | max_median_income |
|---|---|---|---|---|---|
| Maryland | 80214.32 | 81240 | 20091.50 | 35039 | 133267 |
| New Jersey | 75715.26 | 76443 | 15681.09 | 45339 | 124764 |
| Alaska | 75498.80 | 76250 | 7043.59 | 55966 | 90749 |
| Massachusetts | 74418.29 | 74698 | 16882.57 | 42290 | 116571 |
| Hawaii | 74139.45 | 73388 | 9600.96 | 46479 | 92600 |
2. Summary statistics of population by province
# Calculate the summary statistics of population by province
summary_stats_population <- finalData %>%
group_by(province) %>%
summarise(mean_population = round(mean(population), 0),
median_population = round(median(population), 0),
sd_population = round(sd(population), 0),
min_population = round(min(population), 0),
max_population = round(max(population), 0))
# Keep only the 5 rows with the maximum values of 'mean_population' for each province
summary_stats_population_5 <- summary_stats_population %>%
slice_max(n = 5, order_by = mean_population)
# Set the caption
caption_text_population <- "Table 2: Summary Statistics of Population by Province"
# Print the table with caption using knitr::kable()
kable(summary_stats_population_5, caption = caption_text_population, align = "l")%>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| province | mean_population | median_population | sd_population | min_population | max_population |
|---|---|---|---|---|---|
| California | 4043664 | 2323892 | 3866682 | 12925 | 10170292 |
| Arizona | 2916351 | 3889161 | 1749812 | 16845 | 4496588 |
| Illinois | 1711499 | 310749 | 2216082 | 11390 | 5294664 |
| Nevada | 1521362 | 1969975 | 842850 | 15986 | 2292476 |
| Texas | 1461132 | 815996 | 1532290 | 6153 | 4728030 |
3. Summary statistics of the number of fast food restaurants by province:
# Calculate the summary statistics of the number of fast food restaurants by province
summary_stats_restaurants <- finalData %>%
group_by(province) %>%
summarise(total_restaurants = n())
# Keep only the rows with the maximum values of 'total_restaurants'
summary_stats_restaurants_10 <- summary_stats_restaurants %>%
slice_max(n = 10, order_by = total_restaurants)
# Rename the columns in the summary statistics table
new_column_names_restaurants <- c("Province", "Total Restaurants")
# Set the new column names
colnames(summary_stats_restaurants_10) <- new_column_names_restaurants
# Set the caption
caption_text_restaurants <- "Table 3: Summary Statistics of Fast Food Restaurants by Province"
# Print the table with caption using knitr::kable()
kable(summary_stats_restaurants_10, caption = caption_text_restaurants, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Province | Total Restaurants |
|---|---|
| California | 17476 |
| Texas | 14764 |
| Ohio | 13115 |
| Florida | 12155 |
| Illinois | 8278 |
| North Carolina | 8046 |
| Georgia | 7957 |
| Indiana | 7847 |
| Missouri | 7257 |
| Pennsylvania | 6969 |
#This code calculates the total number of fast food restaurants in each province by using the n() function within the summarise() function. The resulting table, summary_stats_restaurants, provides an overview of the distribution of fast food restaurants across different provinces.4. Summary statistics of the average prop_male and prop_female in each province
# Calculate the summary statistics of the average prop_male and prop_female in each province
summary_stats_gender <- finalData %>%
group_by(province) %>%
summarise(mean_prop_male = round(mean(prop_male), 2),
mean_prop_female = round(mean(prop_female), 2))
# Rename the columns in the summary statistics table
new_column_names_gender <- c("Province", "Mean Proportion Male", "Mean Proportion Female")
# Set the new column names
colnames(summary_stats_gender) <- new_column_names_gender
# Set the caption
caption_text_gender <- "Table 4: Summary Statistics of Average Proportions of Male and Female in Each Province"
# Print the table with caption using knitr::kable()
kable(head(summary_stats_gender), caption = caption_text_gender, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Province | Mean Proportion Male | Mean Proportion Female |
|---|---|---|
| Alabama | 0.48 | 0.52 |
| Alaska | 0.52 | 0.48 |
| Arizona | 0.50 | 0.50 |
| Arkansas | 0.49 | 0.51 |
| California | 0.50 | 0.50 |
| Colorado | 0.50 | 0.50 |
5. Summary statistics of prop_poverty by province
#The code calculates summary statistics (mean, median, standard deviation, minimum, maximum) of the proportion of people living in poverty (prop_poverty) grouped by province. The resulting table, summary_stats_poverty, provides insights into the poverty levels across different provinces.
# Calculate the summary statistics of prop_poverty by province
summary_stats_poverty <- finalData %>%
group_by(province) %>%
summarise(mean_prop_poverty = round(mean(prop_poverty), 2),
median_prop_poverty = round(median(prop_poverty), 2),
sd_prop_poverty = round(sd(prop_poverty), 2),
min_prop_poverty = round(min(prop_poverty), 2),
max_prop_poverty = round(max(prop_poverty), 2))
# Rename the columns in the summary statistics table
new_column_names_poverty_province <- c("Province", "Mean Proportion Poverty", "Median Proportion Poverty", "St. Deviation Poverty", "Minimum Proportion Poverty", "Maximum Proportion Poverty")
# Set the new column names
colnames(summary_stats_poverty) <- new_column_names_poverty_province
# Set the caption
caption_text_poverty <- "Table 5: Summary Statistics of Proportion of People Living in Poverty by Province"
# Print the table with caption using knitr::kable()
kable(head(summary_stats_poverty, 5), caption = caption_text_poverty, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Province | Mean Proportion Poverty | Median Proportion Poverty | St. Deviation Poverty | Minimum Proportion Poverty | Maximum Proportion Poverty |
|---|---|---|---|---|---|
| Alabama | 0.17 | 0.18 | 0.05 | 0.04 | 0.36 |
| Alaska | 0.08 | 0.08 | 0.01 | 0.06 | 0.13 |
| Arizona | 0.16 | 0.16 | 0.03 | 0.10 | 0.33 |
| Arkansas | 0.18 | 0.17 | 0.04 | 0.07 | 0.35 |
| California | 0.15 | 0.14 | 0.04 | 0.05 | 0.30 |
6. Summary statistics of median_monthly_rent_cost by province
#The code calculates summary statistics (mean, median, standard deviation, minimum, maximum) of the median monthly rental cost (median_monthly_rent_cost) grouped by province. The resulting table, summary_stats_rent, provides insights into the rental cost variations across different provinces.
# Calculate the summary statistics of median_monthly_rent_cost by province
summary_stats_rent <- finalData %>%
group_by(province) %>%
summarise(mean_rent_cost = round(mean(median_monthly_rent_cost), 2),
median_rent_cost = round(median(median_monthly_rent_cost), 2),
sd_rent_cost = round(sd(median_monthly_rent_cost), 2),
min_rent_cost = round(min(median_monthly_rent_cost), 2),
max_rent_cost = round(max(median_monthly_rent_cost), 2))
# Rename the columns in the summary statistics table
new_column_names_rent_cost <- c("Province", "Mean Rent Cost", "Median Rent Cost", "St. Deviation Rent Cost", "Minimum Rent Cost", "Maximum Rent Cost")
# Set the new column names
colnames(summary_stats_rent) <- new_column_names_rent_cost
# Set the caption
caption_text_rent <- "Table 6: Summary Statistics of Median Monthly Rental Cost by Province"
# Print the table with caption using knitr::kable()
kable(head(summary_stats_rent, 5), caption = caption_text_rent, align = "l") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Province | Mean Rent Cost | Median Rent Cost | St. Deviation Rent Cost | Minimum Rent Cost | Maximum Rent Cost |
|---|---|---|---|---|---|
| Alabama | 763.08 | 768 | 130.05 | 367 | 1194 |
| Alaska | 1166.21 | 1197 | 128.91 | 770 | 1350 |
| Arizona | 963.05 | 943 | 152.64 | 502 | 1389 |
| Arkansas | 712.41 | 713 | 106.91 | 386 | 1053 |
| California | 1335.34 | 1272 | 302.02 | 646 | 2599 |
b. Data Visualization
1. Bar Plot: Comparison of Median Income by Province
# Calculate the median income by province
median_income_by_province <- finalData %>%
group_by(province) %>%
summarise(median_income = median(median_income))
# Sort the data by median income in descending order
median_income_by_province <- median_income_by_province %>%
arrange(desc(median_income))
# Create an interactive bar plot to compare median income by province
plot_median_income <- ggplot(median_income_by_province, aes(x = reorder(province, -median_income), y = median_income)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Comparison of Median Income by Province",
x = "Province",
y = "Median Income (USD)",
caption = "Data Source: FinalData") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert the plot to an interactive plot using plotly
interactive_plot_income_median <- ggplotly(plot_median_income)
# Display the interactive plot
interactive_plot_income_median# This bar plot compares the median income across different provinces. The provinces are ordered based on their median income in descending order. The x-axis represents the provinces, and the y-axis represents the median income in USD. The bars are filled with a steel blue color. The plot has an appropriate title, axis labels with units of measurement, and a caption indicating the data source. The x-axis labels are angled for better readability.2. Word Cloud: Fast Food Leading Brands in the US
# Filter the data for fast food leading brands
leading_brands <- subset(finalData, !is.na(name))$name
# Generate the word frequencies
word_freq <- table(leading_brands)
# Convert the word frequencies to a data frame
word_df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
# Sort the data frame by frequency in descending order
word_df <- word_df[order(word_df$freq, decreasing = TRUE), ]
# Limit the number of words to 100
word_df <- head(word_df, 50)
# Create the interactive word cloud
wordcloud2(word_df, size = 1.5, color = "random-dark", backgroundColor = "white")3. Bar Plot: Top 10 Cities in the US with High Fast Food Rate
# Filter the data for fast food restaurants
fast_food_data <- finalData %>%
filter(!is.na(city))
# Calculate the number of fast food restaurants in each city
city_counts <- fast_food_data %>%
count(city)
# Sort the cities by the number of fast food restaurants in descending order
top_cities <- city_counts %>%
arrange(desc(n)) %>%
head(10)
# Create a bar plot of the top 10 cities with the highest fast food rate
bar_plot <- ggplot(top_cities, aes(x = reorder(city, -n), y = n, fill = n)) +
geom_bar(stat = "identity") +
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Number of Restaurants") +
labs(title = "Top 10 Cities with High Fast Food Rate",
x = "City",
y = "Number of Fast Food Restaurants",
caption = "Data Source: FinalData") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert the plot to an interactive plot using plotly
interactive_plot <- ggplotly(bar_plot)
# Display the interactive plot
interactive_plot4. Line Plot: Trend of Fast Food Restaurants Over the Years
# Calculate the total number of fast food restaurants by year
restaurant_trend <- finalData %>%
group_by(year) %>%
summarise(total_restaurants = n())
# Create an interactive line plot to show the trend of fast food restaurants over the years
plot_restaurant_trend <- ggplot(restaurant_trend, aes(x = year, y = total_restaurants)) +
geom_line(color = "steelblue", size = 1.5) +
labs(title = "Trend of Fast Food Restaurants Over the Years",
x = "Year",
y = "Total Restaurants",
caption = "Data Source: FinalData") +
theme_bw()
# Convert the plot to an interactive plot using plotly
interactive_plot_restaurant_trend <- ggplotly(plot_restaurant_trend)
# Display the interactive plot
interactive_plot_restaurant_trend5. Pie Chart:Proportions of different fast food restaurants in top 3 cities “Cincinnati”, “Las Vegas”, “Houston”
a. For Cicinnati
# Filter the data for Cincinnati
cincinnati_restaurants <- finalData %>%
filter(city == "Cincinnati")
# Calculate the proportions of different fast food restaurants in Cincinnati
cincinnati_proportions <- cincinnati_restaurants %>%
group_by(name) %>%
summarise(proportion = n()) %>%
mutate(proportion = proportion / sum(proportion))
# Create an interactive pie chart for Cincinnati without legend
interactive_pie_cincinnati <- plot_ly(cincinnati_proportions, labels = cincinnati_proportions$name, values = cincinnati_proportions$proportion, type = "pie",
textposition = "outside", insidetextfont = list(color = "#FFFFFF"), hoverinfo = "label+percent",
marker = list(colors = rainbow(length(cincinnati_proportions$name)), line = list(color = "#FFFFFF", width = 2))) %>%
layout(title = "Proportions of Different Fast Food Restaurants in Cincinnati",
showlegend = FALSE,
annotations = list(text = "Data Source: FinalData", x = 0.5, y = -0.2, showarrow = FALSE))
# Display the interactive pie chart without legend
interactive_pie_cincinnatib. For Las Vegas
# Filter the data for Las Vegas
lasvegas_restaurants <- finalData %>%
filter(city == "Las Vegas")
# Calculate the proportions of different fast food restaurants in Las Vegas
lasvegas_proportions <- lasvegas_restaurants %>%
group_by(name) %>%
summarise(proportion = n()) %>%
mutate(proportion = proportion / sum(proportion))
# Create an interactive pie chart for las vegas without legend
interactive_pie_lasvegas <- plot_ly(lasvegas_proportions, labels = lasvegas_proportions$name, values = lasvegas_proportions$proportion, type = "pie",
textposition = "outside", insidetextfont = list(color = "#FFFFFF"), hoverinfo = "label+percent",
marker = list(colors = rainbow(length(lasvegas_proportions$name)), line = list(color = "#FFFFFF", width = 2))) %>%
layout(title = "Proportions of Different Fast Food Restaurants in Las Vegas",
showlegend = FALSE,
annotations = list(text = "Data Source: FinalData", x = 0.5, y = -0.2, showarrow = FALSE))
# Display the interactive pie chart without legend
interactive_pie_lasvegasc. For Houston
# Filter the data for Houston
houston_restaurants <- finalData %>%
filter(city == "Houston")
# Calculate the proportions of different fast food restaurants in Houston
houston_proportions <- houston_restaurants %>%
group_by(name) %>%
summarise(proportion = n()) %>%
mutate(proportion = proportion / sum(proportion))
# Create an interactive pie chart for las vegas without legend
interactive_pie_houston <- plot_ly(houston_proportions, labels = houston_proportions$name, values = houston_proportions$proportion, type = "pie",
textposition = "outside", insidetextfont = list(color = "#FFFFFF"), hoverinfo = "label+percent",
marker = list(colors = rainbow(length(houston_proportions$name)), line = list(color = "#FFFFFF", width = 2))) %>%
layout(title = "Proportions of Different Fast Food Restaurants in Houston",
showlegend = FALSE,
annotations = list(text = "Data Source: FinalData", x = 0.5, y = -0.2, showarrow = FALSE))
# Display the interactive pie chart without legend
interactive_pie_houston6. Box Plot: Distribution of Median Income by Province
# Create the box plot with rotated x-axis labels
plot_median_income <- ggplot(finalData, aes(x = province, y = median_income)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Distribution of Median Income by Province",
x = "Province",
y = "Median Income",
caption = "Data Source: FinalData") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert the plot to an interactive plot using plotly
interactive_plot_median_income <- ggplotly(plot_median_income)
# Display the interactive plot
interactive_plot_median_income7. Scatter Plot: Relationship between Poverty and Fast Food Restaurants
# Calculate the proportion of people in poverty by province
poverty_data <- finalData %>%
group_by(province) %>%
summarise(prop_poverty = mean(prop_poverty),
total_restaurants = n_distinct(name))
# Create an interactive scatter plot
scatter_plot <- ggplot(poverty_data, aes(x = prop_poverty, y = total_restaurants)) +
geom_point(color = "steelblue") +
labs(title = "Relationship between Poverty and Fast Food Restaurants",
x = "Proportion of People in Poverty",
y = "Total Number of Fast Food Restaurants",
caption = "Data Source: FinalData") +
theme_bw()
# Convert the plot to an interactive plot using plotly
interactive_plot_poverty <- ggplotly(scatter_plot)
# Display the interactive plot
interactive_plot_poverty8. USMAP: Number of Fast Food Restaurants per Province
# Calculate the total number of fast food restaurants per province
restaurants_per_province <- finalData %>%
group_by(province) %>%
summarise(total_restaurants = n_distinct(name))
# Convert the province names to state abbreviations
state_abbreviations <- state.abb[match(restaurants_per_province$province, state.name)]
restaurants_per_province$state <- state_abbreviations
# Add state names to the data frame
restaurants_per_province$state_name <- state.name[match(restaurants_per_province$state, state.abb)]
# Create the plotly object
plotly_map <- plot_usmap(data = restaurants_per_province, values = "total_restaurants", color = "black") +
labs(title = "Number of Fast Food Restaurants per Province",
fill = "Total Restaurants",
caption = "Data Source: FinalData") +
scale_fill_continuous(name = "Total Restaurants") +
theme(legend.position = "right")
# Convert the plot to an interactive plot using plotly
interactive_plot_usmap <- ggplotly(plotly_map, tooltip = c("state_name", "total_restaurants"),
text = c("state_name", "total_restaurants"))
# Display the interactive plot
interactive_plot_usmap4. Monte Carlo Methods of Inference
## A Monte Carlo simulation using the One-Way ANOVA model
## Null Hypothesis (H0): There is no difference in the average number of locations for the different restaurant names across the states in the United States.
## Alternative Hypothesis (Ha):There is a difference in the average number of locations for the different restaurant names across the states in the United States.
set.seed(1994)
# Step 1: Create a new data frame with counts of restaurant locations in each state for each restaurant name
restaurant_counts <- finalData %>%
group_by(name, province) %>%
summarise(num_locations = n()) %>%
ungroup()
# Step 2: Subset to restaurants with a sufficiently large number of locations in every state
min_locations_per_state <- 5 # Set the minimum number of locations per state
restaurant_counts_subset <- restaurant_counts %>%
group_by(name) %>%
filter(all(num_locations >= min_locations_per_state)) %>%
ungroup()
# Step 3: Perform One-Way ANOVA
anova_result <- aov(num_locations ~ name, data = restaurant_counts_subset)
# Step 4: Extract the F-statistic from the ANOVA result
f_statistic <- summary(anova_result)[[1]]$`F value`
# Step 5: Create a null distribution of F-statistics using Monte Carlo simulation
num_permutations <- 1000 # Increase the number of permutations
null_distribution <- numeric(num_permutations)
for (i in 1:num_permutations) {
# Step 5a: Shuffle the num_locations randomly between different restaurant names
shuffled_counts <- restaurant_counts %>%
mutate(name_shuffled = sample(name)) %>%
group_by(name_shuffled, province) %>%
summarise(num_locations_shuffled = sum(num_locations)) %>%
ungroup()
# Step 5b: Perform One-Way ANOVA for the shuffled data and extract the F-statistic
null_anova_result <- aov(num_locations_shuffled ~ name_shuffled, data = shuffled_counts)
# Step 5c: Extract the F-statistic from the shuffled ANOVA result
null_f_statistic <- summary(null_anova_result)[[1]]$`F value`
# Step 5d: Check if the F-statistic is present in the summary (in case it's not available for some permutations)
if (!is.null(null_f_statistic)) {
null_distribution[i] <- null_f_statistic
}
}
# Step 6: Remove any NA values from the null distribution
null_distribution <- null_distribution[!is.na(null_distribution)]
# Calculate the 5th percentile of the null distribution
null_5th_percentile <- quantile(null_distribution, 0.05)
# Step 7: Plot the null distribution of F-statistics
comparison_plot_f <- ggplot(data.frame(F_statistic = null_distribution), aes(x = F_statistic)) +
geom_histogram(binwidth = 0.2, fill = alpha("lightblue", 0.7), color = "black") +
geom_vline(xintercept = null_5th_percentile, color = "red", linetype = "dashed") +
geom_vline(xintercept = f_statistic, color = "blue", linetype = "solid") +
labs(x = "F-statistic",
y = "Frequency",
title = "Null Distribution of the F-statistic in One-Way ANOVA") +
theme_bw()
# Convert the plot to an interactive plot using plotly
interactive_plot_one_way <- ggplotly(comparison_plot_f)
# Display the interactive plot
interactive_plot_one_wayInterpretation:
The red dashed vertical line represents the 5th percentile value of the null distribution of F-statistics.
This value is the threshold below which only 5% of the F-statistics from the random permutations fall.
It serves as the critical value for determining statistical significance.
The blue solid vertical line represents the observed F-statistic obtained from the original data.
This value is calculated from the One-Way ANOVA comparing the average number of locations for different
restaurant names across the states.
Comparing the observed F-statistic to the critical value at the 5th percentile, we draw the following conclusions:
- The observed F-statistic is greater than the critical value.
- The observed F-statistic falls in the right tail of the null distribution.
Therefore, based on the observed F-statistic and its comparison with the 5th percentile of the null distribution,
we reject the null hypothesis and conclude that there are differences in the average number of locations
for different restaurant names across the states in the United States.
5. Bootstrap Methods of Inference
# Problem Statement:
# The objective of this analysis is to estimate the median income for a specific dataset ("finalData") using the bootstrap method of inference. We want to determine the uncertainty associated with the median income estimate and construct a 95% confidence interval for the true median income. Additionally, we aim to visualize the distribution of bootstrap medians and compare it to the observed median income to assess the representatives of the observed estimate.
#Removing missing values from median_income
median_income_cleaned <- na.omit(finalData$median_income)
#Calculating the median of a bootstrap sample
bootstrap_median <- function(x) {median(sample(x, replace = TRUE))}
#setting seed
set.seed(123456)
#Generating bootstrap samples while calculating the median of them
bootstrap_median <- replicate(10000, bootstrap_median(median_income_cleaned))
#Calculating the standard error
SE <- sd(bootstrap_median)
#Calculating the 95% confidence intervals
conf_interval <- 0.95
lower_bound <- quantile(bootstrap_median, probs = (1 - conf_interval )/ 2)
upper_bound <- quantile(bootstrap_median, probs = 1 - (1 - conf_interval)/ 2)
#Generating Histogram of the distribution
data_frame <- data.frame(bootstrap_median)
hist_plot <- ggplot(data_frame, aes(x = bootstrap_median)) +
geom_histogram(color = "black", fill = "dodgerblue") +
geom_vline(xintercept = lower_bound, linetype = "dashed", color = "red", size = 1.5) +
geom_vline(xintercept = upper_bound, linetype = "dashed", color = "red", size = 1.5) +
geom_vline(xintercept = median(median_income_cleaned), color = "black", linetype = "dashed", size = 1) +
scale_x_continuous(labels = scales::comma) +
labs(x = "Median income",
y = "Frequency",
title = "Median Income Bootstrap Sample",
subtitle = "Observed median income--dashed line")
# Convert the ggplot histogram to an interactive plot using plotly
interactive_bootstrap <- ggplotly(hist_plot)
# Display the interactive plot
interactive_bootstrap